{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 用户和活动关联关系处理\n",
    "\n",
    "\n",
    "整个数据集中的活动数目（events.csv）太多，因此下面的处理只找出在训练集和测试集中出现过的活动和用户集合，并对它们重新编制索引。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存数据\n",
    "import cPickle\n",
    "import pandas as pd\n",
    "import itertools\n",
    "\n",
    "#处理事件字符串\n",
    "import datetime\n",
    "\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd\n",
    "\n",
    "from collections import defaultdict\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of uniqueUsers :3391\n",
      "number of uniqueEvents :13418\n"
     ]
    }
   ],
   "source": [
    "# Build the user/event association data, restricted to train.csv and test.csv.\n",
    "#\n",
    "# train.csv has 6 columns:\n",
    "#   user       - user ID\n",
    "#   event      - event ID\n",
    "#   invited    - whether the user was invited (0/1)\n",
    "#   timestamp  - ISO-8601 UTC time string: when the user saw the event\n",
    "#   interested, not_interested\n",
    "# test.csv has the same columns except interested and not_interested.\n",
    "#\n",
    "# NOTE: written for Python 2 (cPickle; lines read in 'rb' mode are str).\n",
    "\n",
    "\n",
    "def _dump_pickle(obj, path):\n",
    "    \"\"\"Pickle `obj` to `path`, closing the file handle afterwards.\"\"\"\n",
    "    with open(path, 'wb') as fout:\n",
    "        cPickle.dump(obj, fout)\n",
    "\n",
    "\n",
    "# --- Pass 1: collect the distinct user / event IDs seen in train + test ---\n",
    "uniqueUsers = set()\n",
    "uniqueEvents = set()\n",
    "\n",
    "for filename in [\"train.csv\", \"test.csv\"]:\n",
    "    with open(filename, 'rb') as f:\n",
    "        f.readline()   # skip the header row (column names)\n",
    "        for line in f:\n",
    "            line = line.strip()\n",
    "            if not line:   # tolerate blank lines (e.g. a trailing newline)\n",
    "                continue\n",
    "            cols = line.split(\",\")\n",
    "            uniqueUsers.add(cols[0])    # column 0: user ID\n",
    "            uniqueEvents.add(cols[1])   # column 1: event ID\n",
    "\n",
    "n_uniqueUsers = len(uniqueUsers)\n",
    "n_uniqueEvents = len(uniqueEvents)\n",
    "\n",
    "print(\"number of uniqueUsers :%d\" % n_uniqueUsers)\n",
    "print(\"number of uniqueEvents :%d\" % n_uniqueEvents)\n",
    "\n",
    "# Re-index users and events with dense 0-based integers; these become the\n",
    "# row / column indices of the score matrix below.\n",
    "userIndex = dict((u, i) for i, u in enumerate(uniqueUsers))\n",
    "eventIndex = dict((e, i) for i, e in enumerate(uniqueEvents))\n",
    "\n",
    "# --- Pass 2: train.csv only (the file that carries the labels) ---\n",
    "# Inverted indexes keyed by the dense indices:\n",
    "#   eventsForUser[user_idx]  -> set of event indices recorded for that user\n",
    "#   usersForEvent[event_idx] -> set of user indices recorded for that event\n",
    "eventsForUser = defaultdict(set)\n",
    "usersForEvent = defaultdict(set)\n",
    "\n",
    "# Sparse user x event matrix holding the `interested` flag; it can serve as\n",
    "# the input of a later LFM / SVD++ step.\n",
    "userEventScores = ss.dok_matrix((n_uniqueUsers, n_uniqueEvents))\n",
    "\n",
    "n_records = 0   # kept for compatibility; not updated in this cell\n",
    "with open(\"train.csv\", 'rb') as ftrain:\n",
    "    ftrain.readline()   # skip the header row\n",
    "    for line in ftrain:\n",
    "        line = line.strip()\n",
    "        if not line:\n",
    "            continue\n",
    "        cols = line.split(\",\")\n",
    "        i = userIndex[cols[0]]    # user row\n",
    "        j = eventIndex[cols[1]]   # event column\n",
    "\n",
    "        eventsForUser[i].add(j)\n",
    "        usersForEvent[j].add(i)\n",
    "\n",
    "        # Column 4 is `interested`. A 0 is indistinguishable from a missing\n",
    "        # entry in the sparse matrix, so only interested == 1 is visible there.\n",
    "        userEventScores[i, j] = int(cols[4])\n",
    "\n",
    "# --- Persist the intermediate results for later notebooks ---\n",
    "# Events per user: later used to propagate friends' events to a user.\n",
    "_dump_pickle(eventsForUser, \"PE_eventsForUser.pkl\")\n",
    "# Users per event.\n",
    "_dump_pickle(usersForEvent, \"PE_usersForEvent.pkl\")\n",
    "\n",
    "# User-event relation matrix R.\n",
    "sio.mmwrite(\"PE_userEventScores\", userEventScores)\n",
    "\n",
    "# ID -> dense index lookup tables.\n",
    "_dump_pickle(userIndex, \"PE_userIndex.pkl\")\n",
    "_dump_pickle(eventIndex, \"PE_eventIndex.pkl\")\n",
    "\n",
    "# --- Related pairs ---\n",
    "# To avoid needless similarity computations later, enumerate only related pairs:\n",
    "#   related users  = user pairs that share at least one event\n",
    "#   related events = event pairs that share at least one user\n",
    "# A pair needs exactly two members, so the threshold is >= 2; the previous\n",
    "# `> 2` test silently dropped events with two users / users with two events.\n",
    "uniqueUserPairs = set()\n",
    "for event_users in usersForEvent.values():\n",
    "    if len(event_users) >= 2:\n",
    "        uniqueUserPairs.update(itertools.combinations(event_users, 2))\n",
    "\n",
    "uniqueEventPairs = set()\n",
    "for user_events in eventsForUser.values():\n",
    "    if len(user_events) >= 2:\n",
    "        uniqueEventPairs.update(itertools.combinations(user_events, 2))\n",
    "\n",
    "# Save the related-pair index sets.\n",
    "# NOTE(review): the user-pair file uses an 'FE_' prefix while every other file\n",
    "# uses 'PE_'; the name is kept because other notebooks may load it - confirm.\n",
    "_dump_pickle(uniqueUserPairs, \"FE_uniqueUserPairs.pkl\")\n",
    "_dump_pickle(uniqueEventPairs, \"PE_uniqueEventPairs.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The users and events that appear in the train and test sets are far fewer than the users in users.csv and the events in events.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'433929966',\n",
       " '4234683959',\n",
       " '679447655',\n",
       " '773396729',\n",
       " '3822707736',\n",
       " '1588485860',\n",
       " '1465884852',\n",
       " '1968232351',\n",
       " '757000026',\n",
       " '2051799233',\n",
       " '447444721',\n",
       " '2301723344',\n",
       " '4028072926',\n",
       " '1935879458',\n",
       " '3109164411',\n",
       " '833858713',\n",
       " '186124210',\n",
       " '1180225343',\n",
       " '4016804175',\n",
       " '2726295050',\n",
       " '4266442206',\n",
       " '2313587840',\n",
       " '4141101486',\n",
       " '1458906695',\n",
       " '2450977738',\n",
       " '915187958',\n",
       " '407288007',\n",
       " '3768548263',\n",
       " '3051088984',\n",
       " '1933276938',\n",
       " '1770070953',\n",
       " '3965935592',\n",
       " '2812330223',\n",
       " '3323980704',\n",
       " '4049041156',\n",
       " '3696272381',\n",
       " '1281252091',\n",
       " '1319656469',\n",
       " '1872379613',\n",
       " '914775174',\n",
       " '2410772923',\n",
       " '230117918',\n",
       " '127163854',\n",
       " '1929917726',\n",
       " '2527738096',\n",
       " '417123290',\n",
       " '1499530182',\n",
       " '332142007',\n",
       " '4126236405',\n",
       " '1880617811',\n",
       " '106978589',\n",
       " '2850509209',\n",
       " '2507275690',\n",
       " '3770866498',\n",
       " '3349548561',\n",
       " '4096532695',\n",
       " '1607711017',\n",
       " '2043006756',\n",
       " '2606383328',\n",
       " '1977406305',\n",
       " '3172060382',\n",
       " '2646392315',\n",
       " '2912281746',\n",
       " '1907550340',\n",
       " '774093053',\n",
       " '3327557579',\n",
       " '3802650890',\n",
       " '2055922065',\n",
       " '3711909804',\n",
       " '393394477',\n",
       " '2896451394',\n",
       " '4201550847',\n",
       " '2474982115',\n",
       " '3805108320',\n",
       " '650745497',\n",
       " '79214950',\n",
       " '973265540',\n",
       " '806826749',\n",
       " '2840088969',\n",
       " '3700950547',\n",
       " '1000481836',\n",
       " '1959421009',\n",
       " '3217192264',\n",
       " '894829625',\n",
       " '4293596113',\n",
       " '2539102419',\n",
       " '3446267401',\n",
       " '3697638490',\n",
       " '2415873572',\n",
       " '169266244',\n",
       " '2422402842',\n",
       " '3523548061',\n",
       " '3946739090',\n",
       " '256821264',\n",
       " '1310511988',\n",
       " '2768392299',\n",
       " '345960905',\n",
       " '352751137',\n",
       " '265740872',\n",
       " '1880156476',\n",
       " '3179167671',\n",
       " '2818648337',\n",
       " '1771812765',\n",
       " '31456310',\n",
       " '4241501571',\n",
       " '1569316024',\n",
       " '4058800014',\n",
       " '3354141479',\n",
       " '682667459',\n",
       " '4261753049',\n",
       " '2078647608',\n",
       " '321356225',\n",
       " '1512828747',\n",
       " '598708806',\n",
       " '3161734222',\n",
       " '276320151',\n",
       " '915963444',\n",
       " '2877011901',\n",
       " '1073720066',\n",
       " '2188481598',\n",
       " '3197423454',\n",
       " '3976610702',\n",
       " '439311946',\n",
       " '2723158684',\n",
       " '2221840547',\n",
       " '2847152365',\n",
       " '2024105424',\n",
       " '2128371414',\n",
       " '3312755383',\n",
       " '4073832558',\n",
       " '3737966751',\n",
       " '4227274972',\n",
       " '228340462',\n",
       " '1561244898',\n",
       " '454719225',\n",
       " '2075203466',\n",
       " '533036818',\n",
       " '137465175',\n",
       " '1722911749',\n",
       " '2166119628',\n",
       " '2186351621',\n",
       " '3273736995',\n",
       " '4083994402',\n",
       " '272875209',\n",
       " '1868735086',\n",
       " '1203574711',\n",
       " '3571352784',\n",
       " '1158452780',\n",
       " '1043841273',\n",
       " '740148951',\n",
       " '2456710844',\n",
       " '234487772',\n",
       " '3038741300',\n",
       " '1177314523',\n",
       " '83427780',\n",
       " '1764893644',\n",
       " '3812153136',\n",
       " '1736406045',\n",
       " '3472113541',\n",
       " '2280892735',\n",
       " '376720958',\n",
       " '3033485944',\n",
       " '1860786983',\n",
       " '3413988347',\n",
       " '3275851610',\n",
       " '8559115',\n",
       " '1438024794',\n",
       " '3507025494',\n",
       " '1467809542',\n",
       " '3759330615',\n",
       " '3819567837',\n",
       " '2752772467',\n",
       " '3954432137',\n",
       " '3841578300',\n",
       " '317875697',\n",
       " '3523716576',\n",
       " '981244440',\n",
       " '3075319534',\n",
       " '38825896',\n",
       " '2103025461',\n",
       " '1232039292',\n",
       " '4031268561',\n",
       " '2464636842',\n",
       " '2512346948',\n",
       " '1203717384',\n",
       " '574261812',\n",
       " '390684326',\n",
       " '1827283730',\n",
       " '3594729381',\n",
       " '2042697619',\n",
       " '3453000167',\n",
       " '3949816728',\n",
       " '4043937476',\n",
       " '3037717443',\n",
       " '2877314442',\n",
       " '866923839',\n",
       " '1061745506',\n",
       " '655495803',\n",
       " '486447403',\n",
       " '1173365630',\n",
       " '2626284779',\n",
       " '738179988',\n",
       " '3884256656',\n",
       " '244220291',\n",
       " '3057888073',\n",
       " '3729283847',\n",
       " '1430932461',\n",
       " '3402209713',\n",
       " '675888033',\n",
       " '1915380946',\n",
       " '3534091101',\n",
       " '1824334392',\n",
       " '1696863642',\n",
       " '3329566428',\n",
       " '2762282302',\n",
       " '2118802492',\n",
       " '3549888444',\n",
       " '1549888368',\n",
       " '3273774165',\n",
       " '2598678508',\n",
       " '3926022702',\n",
       " '1633704318',\n",
       " '771048344',\n",
       " '3889211600',\n",
       " '978829373',\n",
       " '1374330612',\n",
       " '3996685669',\n",
       " '3400642235',\n",
       " '460661301',\n",
       " '1590675258',\n",
       " '2700038507',\n",
       " '984516743',\n",
       " '1758417603',\n",
       " '2235428011',\n",
       " '1282404404',\n",
       " '2029812325',\n",
       " '350873258',\n",
       " '975577539',\n",
       " '613668540',\n",
       " '138523229',\n",
       " '3175031640',\n",
       " '2560867051',\n",
       " '4219250662',\n",
       " '3892832138',\n",
       " '1873976153',\n",
       " '2867772846',\n",
       " '2363192851',\n",
       " '2329959118',\n",
       " '102026311',\n",
       " '709512451',\n",
       " '251657645',\n",
       " '1298016856',\n",
       " '3412307751',\n",
       " '3352558065',\n",
       " '513780850',\n",
       " '2472934939',\n",
       " '1462428678',\n",
       " '2859386475',\n",
       " '811652961',\n",
       " '1429589754',\n",
       " '1427394700',\n",
       " '1417587050',\n",
       " '750506418',\n",
       " '1537177508',\n",
       " '1810281079',\n",
       " '3167757146',\n",
       " '393416610',\n",
       " '3069899537',\n",
       " '2526925550',\n",
       " '536949855',\n",
       " '2106861684',\n",
       " '3729155908',\n",
       " '1111361846',\n",
       " '3267347122',\n",
       " '371746452',\n",
       " '1888695830',\n",
       " '1665697394',\n",
       " '1738181791',\n",
       " '518255514',\n",
       " '1963825952',\n",
       " '354333473',\n",
       " '647866667',\n",
       " '511362023',\n",
       " '505464566',\n",
       " '1255932343',\n",
       " '2993132089',\n",
       " '1739388067',\n",
       " '2232544430',\n",
       " '3041724882',\n",
       " '1985321109',\n",
       " '1471308455',\n",
       " '2272129750',\n",
       " '4010042708',\n",
       " '2134876228',\n",
       " '1993199919',\n",
       " '2832466526',\n",
       " '3703433044',\n",
       " '2961046023',\n",
       " '435473446',\n",
       " '2974043284',\n",
       " '3773409641',\n",
       " '2913860360',\n",
       " '55735397',\n",
       " '2422072300',\n",
       " '2458143482',\n",
       " '1913672917',\n",
       " '2156430059',\n",
       " '4055585686',\n",
       " '3667110496',\n",
       " '2691509364',\n",
       " '100417525',\n",
       " '1139116596',\n",
       " '377254812',\n",
       " '3801433102',\n",
       " '2157401626',\n",
       " '1711604318',\n",
       " '805963128',\n",
       " '3619266504',\n",
       " '1988291613',\n",
       " '1381535648',\n",
       " '2660205855',\n",
       " '268203907',\n",
       " '3404489024',\n",
       " '263279656',\n",
       " '166297284',\n",
       " '4244406355',\n",
       " '1118578975',\n",
       " '490590226',\n",
       " '1828726073',\n",
       " '2408267303',\n",
       " '3319862596',\n",
       " '171025395',\n",
       " '677713566',\n",
       " '61894853',\n",
       " '1712448138',\n",
       " '2407176743',\n",
       " '3657419629',\n",
       " '2007442218',\n",
       " '1063154747',\n",
       " '1366554501',\n",
       " '3574237758',\n",
       " '1295729468',\n",
       " '3851706723',\n",
       " '1988027954',\n",
       " '3036332464',\n",
       " '3365983510',\n",
       " '4089666658',\n",
       " '582672945',\n",
       " '3696948957',\n",
       " '1294755971',\n",
       " '2977586904',\n",
       " '232415600',\n",
       " '1592001933',\n",
       " '1656987460',\n",
       " '1781009055',\n",
       " '2670583521',\n",
       " '4116595591',\n",
       " '2865762984',\n",
       " '844299203',\n",
       " '135744766',\n",
       " '615488222',\n",
       " '4242816413',\n",
       " '650456731',\n",
       " '2851625619',\n",
       " '322679569',\n",
       " '2653321720',\n",
       " '4092689670',\n",
       " '274265388',\n",
       " '1309574084',\n",
       " '1218003893',\n",
       " '3935738455',\n",
       " '2796594102',\n",
       " '495818697',\n",
       " '407061424',\n",
       " '3892571955',\n",
       " '1930033127',\n",
       " '526306322',\n",
       " '794450376',\n",
       " '3144013743',\n",
       " '2448320909',\n",
       " '1695480317',\n",
       " '3407311067',\n",
       " '122289878',\n",
       " '3536688266',\n",
       " '2750873665',\n",
       " '3051804335',\n",
       " '3843631351',\n",
       " '3890406080',\n",
       " '3434513569',\n",
       " '1630243360',\n",
       " '1184535341',\n",
       " '1006903887',\n",
       " '1093614806',\n",
       " '2129718710',\n",
       " '1341096506',\n",
       " '1637427865',\n",
       " '1657604679',\n",
       " '1860511950',\n",
       " '1854380763',\n",
       " '2017624114',\n",
       " '1365361942',\n",
       " '146887909',\n",
       " '3221466050',\n",
       " '2794180407',\n",
       " '450183377',\n",
       " '553840202',\n",
       " '321597084',\n",
       " '679700922',\n",
       " '1889940073',\n",
       " '3512357259',\n",
       " '3890023938',\n",
       " '3617812300',\n",
       " '826832481',\n",
       " '1190152428',\n",
       " '2706390147',\n",
       " '1935315082',\n",
       " '2644800408',\n",
       " '2906696294',\n",
       " '525889178',\n",
       " '1033421355',\n",
       " '4172382349',\n",
       " '3021023405',\n",
       " '4072078077',\n",
       " '4205511631',\n",
       " '4264461482',\n",
       " '2844691787',\n",
       " '4070309332',\n",
       " '3610112479',\n",
       " '1750345998',\n",
       " '619666754',\n",
       " '3938234624',\n",
       " '2077865887',\n",
       " '1291141020',\n",
       " '737340986',\n",
       " '1686028847',\n",
       " '2190223358',\n",
       " '3855173861',\n",
       " '3318022618',\n",
       " '383607907',\n",
       " '3007406784',\n",
       " '3988717856',\n",
       " '582287969',\n",
       " '3119717504',\n",
       " '704244113',\n",
       " '2996288314',\n",
       " '3913698961',\n",
       " '536059045',\n",
       " '1351516910',\n",
       " '1363545588',\n",
       " '2588804869',\n",
       " '2409474647',\n",
       " '553125617',\n",
       " '1764172149',\n",
       " '2115776454',\n",
       " '1507489572',\n",
       " '4027073137',\n",
       " '4192978272',\n",
       " '4037816474',\n",
       " '4120806416',\n",
       " '2706955572',\n",
       " '51076977',\n",
       " '512403156',\n",
       " '3865518711',\n",
       " '2662850238',\n",
       " '3441655380',\n",
       " '1819998447',\n",
       " '3531332626',\n",
       " '3980181155',\n",
       " '643979828',\n",
       " '1640004842',\n",
       " '3289739345',\n",
       " '1948173913',\n",
       " '3434354867',\n",
       " '2384254802',\n",
       " '3402377961',\n",
       " '1647257656',\n",
       " '3183605169',\n",
       " '4156705844',\n",
       " '2928192183',\n",
       " '1077825961',\n",
       " '1187117667',\n",
       " '1527128646',\n",
       " '3501509607',\n",
       " '99226238',\n",
       " '1403176651',\n",
       " '2572117226',\n",
       " '4088363232',\n",
       " '1146130134',\n",
       " '2263694418',\n",
       " '3187969599',\n",
       " '410842839',\n",
       " '2828360569',\n",
       " '1807883016',\n",
       " '1179474743',\n",
       " '1099380892',\n",
       " '2971102319',\n",
       " '2944816123',\n",
       " '3268726556',\n",
       " '457253413',\n",
       " '3534827249',\n",
       " '2834364568',\n",
       " '3788504448',\n",
       " '1565011423',\n",
       " '3190717208',\n",
       " '4202744793',\n",
       " '3500532250',\n",
       " '2568221006',\n",
       " '1800619761',\n",
       " '2180353936',\n",
       " '1315709702',\n",
       " '2559164171',\n",
       " '3219983221',\n",
       " '3286516126',\n",
       " '866903362',\n",
       " '2370228216',\n",
       " '353850493',\n",
       " '3823594890',\n",
       " '2798408743',\n",
       " '1727806427',\n",
       " '326944617',\n",
       " '1237174145',\n",
       " '4000643494',\n",
       " '1245942866',\n",
       " '2671865684',\n",
       " '1955298811',\n",
       " '1904753074',\n",
       " '2274599788',\n",
       " '974024618',\n",
       " '1303299495',\n",
       " '914240358',\n",
       " '2178315774',\n",
       " '1819771813',\n",
       " '785309813',\n",
       " '3492796375',\n",
       " '519767743',\n",
       " '3075707957',\n",
       " '769827874',\n",
       " '2036167774',\n",
       " '4254934247',\n",
       " '58444357',\n",
       " '1005830738',\n",
       " '2608593001',\n",
       " '1387775604',\n",
       " '1977475739',\n",
       " '2444024386',\n",
       " '85675684',\n",
       " '3466325304',\n",
       " '3275948662',\n",
       " '2301324542',\n",
       " '3946619910',\n",
       " '787562214',\n",
       " '1329952040',\n",
       " '3089507532',\n",
       " '67648066',\n",
       " '3211869804',\n",
       " '1296215153',\n",
       " '2704376528',\n",
       " '643535605',\n",
       " '3701320707',\n",
       " '3129396339',\n",
       " '1889909673',\n",
       " '3931717290',\n",
       " '896485186',\n",
       " '1824198418',\n",
       " '1829771929',\n",
       " '1515079649',\n",
       " '526347239',\n",
       " '1407801833',\n",
       " '2084337326',\n",
       " '3391397881',\n",
       " '1551306530',\n",
       " '4276521055',\n",
       " '1439724705',\n",
       " '1700908753',\n",
       " '1783630773',\n",
       " '1605832718',\n",
       " '2071994542',\n",
       " '468922992',\n",
       " '2614611034',\n",
       " '3105234108',\n",
       " '2883057132',\n",
       " '2422202663',\n",
       " '1209548026',\n",
       " '1955143671',\n",
       " '1872585098',\n",
       " '1793588465',\n",
       " '3636792221',\n",
       " '2020109352',\n",
       " '4086500156',\n",
       " '4112668226',\n",
       " '374048503',\n",
       " '830183321',\n",
       " '2689644306',\n",
       " '2808705599',\n",
       " '3722242006',\n",
       " '2012765486',\n",
       " '1044785443',\n",
       " '1261557576',\n",
       " '328882352',\n",
       " '1189005464',\n",
       " '1281335602',\n",
       " '1064284965',\n",
       " '3022634670',\n",
       " '1953860425',\n",
       " '927693315',\n",
       " '2533071551',\n",
       " '2505738116',\n",
       " '3602218872',\n",
       " '693206007',\n",
       " '663617415',\n",
       " '2539026679',\n",
       " '3413478237',\n",
       " '3926217735',\n",
       " '2505330132',\n",
       " '832277631',\n",
       " '3622325352',\n",
       " '3200581311',\n",
       " '2053603846',\n",
       " '2704232594',\n",
       " '1582270949',\n",
       " '4125204455',\n",
       " '3981836848',\n",
       " '3855347794',\n",
       " '939509575',\n",
       " '3754239356',\n",
       " '1545607295',\n",
       " '1163805498',\n",
       " '2650490227',\n",
       " '3017293541',\n",
       " '2011450028',\n",
       " '3231944809',\n",
       " '3935076467',\n",
       " '432884667',\n",
       " '3341857073',\n",
       " '3113660150',\n",
       " '3008504566',\n",
       " '852324847',\n",
       " '1607531015',\n",
       " '1351006732',\n",
       " '1381165425',\n",
       " '1274017391',\n",
       " '3425996117',\n",
       " '1986416649',\n",
       " '1153175316',\n",
       " '787628032',\n",
       " '2028042509',\n",
       " '3404987729',\n",
       " '3067996970',\n",
       " '263011441',\n",
       " '1103896952',\n",
       " '1883524093',\n",
       " '3034841234',\n",
       " '474449136',\n",
       " '289899674',\n",
       " '3225161223',\n",
       " '788806703',\n",
       " '947107369',\n",
       " '1582951351',\n",
       " '4265093803',\n",
       " '1817434960',\n",
       " '832111597',\n",
       " '2219392559',\n",
       " '2384973677',\n",
       " '2118368309',\n",
       " '3809205963',\n",
       " '3430641796',\n",
       " '3126297081',\n",
       " '1844176380',\n",
       " '4110062919',\n",
       " '2993747793',\n",
       " '878068450',\n",
       " '2424637456',\n",
       " '956438727',\n",
       " '590488063',\n",
       " '3621854275',\n",
       " '268174971',\n",
       " '4220792229',\n",
       " '293751558',\n",
       " '1395640727',\n",
       " '3319506135',\n",
       " '322709533',\n",
       " '2368911433',\n",
       " '964120039',\n",
       " '4099353109',\n",
       " '1742102536',\n",
       " '1359556127',\n",
       " '1271755958',\n",
       " '1018819627',\n",
       " '3580442023',\n",
       " '4232519602',\n",
       " '1032588622',\n",
       " '1342847040',\n",
       " '2610561518',\n",
       " '4002881841',\n",
       " '1023712594',\n",
       " '2509552492',\n",
       " '3629737683',\n",
       " '561449801',\n",
       " '945869322',\n",
       " '2179267582',\n",
       " '1359749784',\n",
       " '2766617713',\n",
       " '2859654056',\n",
       " '845902488',\n",
       " '760916752',\n",
       " '705007381',\n",
       " '3816284328',\n",
       " '1764411835',\n",
       " '1945597998',\n",
       " '4104785185',\n",
       " '1987345098',\n",
       " '1143203761',\n",
       " '3906291725',\n",
       " '1750885159',\n",
       " '3171268449',\n",
       " '176859368',\n",
       " '2839635887',\n",
       " '582041333',\n",
       " '2391976137',\n",
       " '2424074793',\n",
       " '1023138400',\n",
       " '426166781',\n",
       " '249760834',\n",
       " '3222871469',\n",
       " '3834889020',\n",
       " '3482282242',\n",
       " '1255629030',\n",
       " '3501001868',\n",
       " '2802110996',\n",
       " '1807182727',\n",
       " '1340479010',\n",
       " '172405986',\n",
       " '3548928882',\n",
       " '1062024228',\n",
       " '3950787980',\n",
       " '3635358150',\n",
       " '1389265162',\n",
       " '547383999',\n",
       " '3738793886',\n",
       " '3656008998',\n",
       " '3842924038',\n",
       " '3658165839',\n",
       " '4188214514',\n",
       " '1198319209',\n",
       " '4220107080',\n",
       " '3008408180',\n",
       " '1572089933',\n",
       " '393162138',\n",
       " '4051353547',\n",
       " '1658768128',\n",
       " '4254164270',\n",
       " '1506378274',\n",
       " '743289246',\n",
       " '1596190412',\n",
       " '471488113',\n",
       " '1327533901',\n",
       " '750863544',\n",
       " '2806794871',\n",
       " '2893435883',\n",
       " '24645215',\n",
       " '35091389',\n",
       " '2184211588',\n",
       " '1703679230',\n",
       " '610221734',\n",
       " '3284750825',\n",
       " '3326401128',\n",
       " '885971279',\n",
       " '2692070381',\n",
       " '1690154685',\n",
       " '3479738412',\n",
       " '899213418',\n",
       " '663120398',\n",
       " '1764159783',\n",
       " '1620415785',\n",
       " '3579174462',\n",
       " '1079899197',\n",
       " '2554360190',\n",
       " '316263527',\n",
       " '3700378878',\n",
       " '2567308538',\n",
       " '1012343640',\n",
       " '705138890',\n",
       " '1142967652',\n",
       " '2943640347',\n",
       " '1685190005',\n",
       " '3803531290',\n",
       " '205658920',\n",
       " '3436633625',\n",
       " '679023125',\n",
       " '2554361988',\n",
       " '3250536690',\n",
       " '2062625787',\n",
       " '918116457',\n",
       " '728126986',\n",
       " '816912378',\n",
       " '2460957668',\n",
       " '3632072502',\n",
       " '2153348068',\n",
       " '3304261393',\n",
       " '3847927617',\n",
       " '3800668884',\n",
       " '2409838112',\n",
       " '3671450770',\n",
       " '3123256869',\n",
       " '940376988',\n",
       " '1430458652',\n",
       " '667664881',\n",
       " '2925109008',\n",
       " '688975255',\n",
       " '2963144429',\n",
       " '1036961928',\n",
       " '1395200630',\n",
       " '951253136',\n",
       " '2209898896',\n",
       " '3926229879',\n",
       " '1122283794',\n",
       " '3628957138',\n",
       " '1850184598',\n",
       " '3161434996',\n",
       " '1711893150',\n",
       " '3547653054',\n",
       " '1274527630',\n",
       " '61104529',\n",
       " '2840642149',\n",
       " '3155836085',\n",
       " '2063976566',\n",
       " '826688620',\n",
       " '3330051251',\n",
       " '872878580',\n",
       " '312581825',\n",
       " '861989233',\n",
       " '3009383497',\n",
       " '1120182267',\n",
       " '3863651111',\n",
       " '1486919469',\n",
       " '4228109405',\n",
       " '1960789592',\n",
       " '2703174925',\n",
       " '477591068',\n",
       " '63726188',\n",
       " '2688159351',\n",
       " '3369636425',\n",
       " '2534729330',\n",
       " '87840033',\n",
       " '1730501944',\n",
       " '4235863537',\n",
       " '2454980175',\n",
       " '300126317',\n",
       " '578037803',\n",
       " '524756826',\n",
       " '646715160',\n",
       " '4266815563',\n",
       " '4256067371',\n",
       " '721082024',\n",
       " '1294433853',\n",
       " '2254075965',\n",
       " '256505275',\n",
       " '1170318717',\n",
       " '468069570',\n",
       " '3191533412',\n",
       " '3321517547',\n",
       " '2412119621',\n",
       " '626503343',\n",
       " '1400053177',\n",
       " '1170338316',\n",
       " '1385126312',\n",
       " '1397996487',\n",
       " '3429958607',\n",
       " '2137154383',\n",
       " '1803275543',\n",
       " '1454534917',\n",
       " '744858144',\n",
       " '1757362667',\n",
       " '2976928477',\n",
       " '2437671297',\n",
       " '3453997003',\n",
       " '2007279414',\n",
       " '473090869',\n",
       " '1020530176',\n",
       " '3174177804',\n",
       " '1117746020',\n",
       " '181332418',\n",
       " '536196786',\n",
       " '493625429',\n",
       " '3054051090',\n",
       " '2269626530',\n",
       " '2159444507',\n",
       " '974085606',\n",
       " '2064292569',\n",
       " '1396446223',\n",
       " '1310931838',\n",
       " '1669367540',\n",
       " '2758697837',\n",
       " '726662409',\n",
       " '3432819058',\n",
       " '2279381833',\n",
       " '4004100709',\n",
       " '3684919275',\n",
       " '1769487166',\n",
       " '815715465',\n",
       " '2088284484',\n",
       " '1554642353',\n",
       " '3681999765',\n",
       " '1812117472',\n",
       " '64442250',\n",
       " '692075109',\n",
       " '3080242563',\n",
       " '877202695',\n",
       " '4189303160',\n",
       " '1214605379',\n",
       " '44607617',\n",
       " '1696929787',\n",
       " '1506510838',\n",
       " '2234892318',\n",
       " '317758728',\n",
       " '2505215665',\n",
       " '54535450',\n",
       " '1880608957',\n",
       " '4181300264',\n",
       " '3416395266',\n",
       " '2490438703',\n",
       " '2021591040',\n",
       " '4215173213',\n",
       " '4100237668',\n",
       " '3887323073',\n",
       " '1951237429',\n",
       " '3129233779',\n",
       " '3080864770',\n",
       " '1282392038',\n",
       " '2491530958',\n",
       " '3765974734',\n",
       " '2518430453',\n",
       " '3406035843',\n",
       " '604019700',\n",
       " '2213514822',\n",
       " '1094725860',\n",
       " '2714036762',\n",
       " '3045891165',\n",
       " '1587680670',\n",
       " '1287884858',\n",
       " '3099686443',\n",
       " '565086586',\n",
       " '1386545389',\n",
       " '3123645626',\n",
       " '887203538',\n",
       " '3864658857',\n",
       " '143166460',\n",
       " '3794166180',\n",
       " '4267881656',\n",
       " '335083853',\n",
       " '2038842201',\n",
       " '1720077487',\n",
       " '2585162162',\n",
       " '3097220122',\n",
       " '1985153611',\n",
       " '4231641706',\n",
       " '1633263987',\n",
       " '2253311961',\n",
       " '1704192379',\n",
       " '3235943764',\n",
       " '791860627',\n",
       " '1256540913',\n",
       " '361295277',\n",
       " '682567291',\n",
       " '3949409640',\n",
       " '1702401336',\n",
       " '3237640011',\n",
       " '1602227394',\n",
       " '383563223',\n",
       " '252821798',\n",
       " '1355891623',\n",
       " '2393885244',\n",
       " '943870191',\n",
       " '1407659332',\n",
       " '846069276',\n",
       " '2363524424',\n",
       " '1044854627',\n",
       " '1565715575',\n",
       " '110357109',\n",
       " '1380051674',\n",
       " '2780690728',\n",
       " '396698671',\n",
       " '1396454425',\n",
       " '733567972',\n",
       " '1272961219',\n",
       " '316055690',\n",
       " '398208897',\n",
       " '1118969585',\n",
       " '1855529308',\n",
       " '613851355',\n",
       " '1516820536',\n",
       " '3731778741',\n",
       " '480523741',\n",
       " '939563341',\n",
       " '2145223158',\n",
       " '3655295959',\n",
       " '2452291959',\n",
       " '791187416',\n",
       " '3039309993',\n",
       " '2157199483',\n",
       " ...}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniqueEvents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "events = pd.read_csv('events.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 110 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     event_id     user_id                start_time city state  zip country  \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "\n",
       "   lat  lng  c_1   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  c_99  \\\n",
       "0  NaN  NaN    2   ...        0     1     0     0     0     0     0     0   \n",
       "1  NaN  NaN    2   ...        0     0     0     0     0     0     0     0   \n",
       "2  NaN  NaN    0   ...        0     0     0     0     0     0     0     0   \n",
       "3  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "4  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "\n",
       "   c_100  c_other  \n",
       "0      0        9  \n",
       "1      0        7  \n",
       "2      0       12  \n",
       "3      0        8  \n",
       "4      0        9  \n",
       "\n",
       "[5 rows x 110 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3137972 entries, 0 to 3137971\n",
      "Columns: 110 entries, event_id to c_other\n",
      "dtypes: float64(2), int64(103), object(5)\n",
      "memory usage: 2.6+ GB\n"
     ]
    }
   ],
   "source": [
    "events.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# uniqueEvents stores event IDs as strings (see its repr above), while\n",
    "# events['event_id'] is an int64 column. Cast the IDs explicitly so the\n",
    "# membership test does not rely on pandas coercing the values implicitly;\n",
    "# without the cast a stricter pandas may silently select zero rows.\n",
    "unique_event_ids = [int(event_id) for event_id in uniqueEvents]\n",
    "enents_in_tt = events[events['event_id'].isin(unique_event_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 110 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     event_id     user_id                start_time city state  zip country  \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "\n",
       "   lat  lng  c_1   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  c_99  \\\n",
       "0  NaN  NaN    2   ...        0     1     0     0     0     0     0     0   \n",
       "1  NaN  NaN    2   ...        0     0     0     0     0     0     0     0   \n",
       "2  NaN  NaN    0   ...        0     0     0     0     0     0     0     0   \n",
       "3  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "4  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "\n",
       "   c_100  c_other  \n",
       "0      0        9  \n",
       "1      0        7  \n",
       "2      0       12  \n",
       "3      0        8  \n",
       "4      0        9  \n",
       "\n",
       "[5 rows x 110 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enents_in_tt.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 13418 entries, 0 to 3137701\n",
      "Columns: 110 entries, event_id to c_other\n",
      "dtypes: float64(2), int64(103), object(5)\n",
      "memory usage: 11.4+ MB\n"
     ]
    }
   ],
   "source": [
    "enents_in_tt.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>c_2</th>\n",
       "      <th>c_3</th>\n",
       "      <th>c_4</th>\n",
       "      <th>c_5</th>\n",
       "      <th>c_6</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1.341800e+04</td>\n",
       "      <td>1.341800e+04</td>\n",
       "      <td>8062.000000</td>\n",
       "      <td>8062.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "      <td>13418.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>2.140873e+09</td>\n",
       "      <td>2.134713e+09</td>\n",
       "      <td>25.727517</td>\n",
       "      <td>-24.807209</td>\n",
       "      <td>2.359964</td>\n",
       "      <td>1.464972</td>\n",
       "      <td>1.323372</td>\n",
       "      <td>0.888732</td>\n",
       "      <td>1.159711</td>\n",
       "      <td>2.479654</td>\n",
       "      <td>...</td>\n",
       "      <td>0.064913</td>\n",
       "      <td>0.083992</td>\n",
       "      <td>0.093755</td>\n",
       "      <td>0.070502</td>\n",
       "      <td>0.082427</td>\n",
       "      <td>0.233790</td>\n",
       "      <td>0.082874</td>\n",
       "      <td>0.076837</td>\n",
       "      <td>0.073558</td>\n",
       "      <td>57.554777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.232469e+09</td>\n",
       "      <td>1.254357e+09</td>\n",
       "      <td>21.162472</td>\n",
       "      <td>91.900619</td>\n",
       "      <td>19.331141</td>\n",
       "      <td>2.959769</td>\n",
       "      <td>2.720104</td>\n",
       "      <td>1.972209</td>\n",
       "      <td>15.695718</td>\n",
       "      <td>7.375475</td>\n",
       "      <td>...</td>\n",
       "      <td>0.309890</td>\n",
       "      <td>0.377730</td>\n",
       "      <td>0.388404</td>\n",
       "      <td>0.312148</td>\n",
       "      <td>0.503164</td>\n",
       "      <td>15.553234</td>\n",
       "      <td>0.356777</td>\n",
       "      <td>0.455338</td>\n",
       "      <td>0.337954</td>\n",
       "      <td>110.916584</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.040700e+05</td>\n",
       "      <td>1.329876e+06</td>\n",
       "      <td>-86.151000</td>\n",
       "      <td>-157.991000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.081551e+09</td>\n",
       "      <td>1.027696e+09</td>\n",
       "      <td>3.608000</td>\n",
       "      <td>-96.886500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>14.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2.122509e+09</td>\n",
       "      <td>2.150758e+09</td>\n",
       "      <td>34.040000</td>\n",
       "      <td>-76.794000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>38.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>3.206782e+09</td>\n",
       "      <td>3.220623e+09</td>\n",
       "      <td>42.983750</td>\n",
       "      <td>98.656750</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>75.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>4.294677e+09</td>\n",
       "      <td>4.294033e+09</td>\n",
       "      <td>61.498000</td>\n",
       "      <td>174.777000</td>\n",
       "      <td>2186.000000</td>\n",
       "      <td>82.000000</td>\n",
       "      <td>85.000000</td>\n",
       "      <td>71.000000</td>\n",
       "      <td>1801.000000</td>\n",
       "      <td>306.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>23.000000</td>\n",
       "      <td>1801.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>16.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>9664.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 105 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           event_id       user_id          lat          lng           c_1  \\\n",
       "count  1.341800e+04  1.341800e+04  8062.000000  8062.000000  13418.000000   \n",
       "mean   2.140873e+09  2.134713e+09    25.727517   -24.807209      2.359964   \n",
       "std    1.232469e+09  1.254357e+09    21.162472    91.900619     19.331141   \n",
       "min    1.040700e+05  1.329876e+06   -86.151000  -157.991000      0.000000   \n",
       "25%    1.081551e+09  1.027696e+09     3.608000   -96.886500      0.000000   \n",
       "50%    2.122509e+09  2.150758e+09    34.040000   -76.794000      1.000000   \n",
       "75%    3.206782e+09  3.220623e+09    42.983750    98.656750      3.000000   \n",
       "max    4.294677e+09  4.294033e+09    61.498000   174.777000   2186.000000   \n",
       "\n",
       "                c_2           c_3           c_4           c_5           c_6  \\\n",
       "count  13418.000000  13418.000000  13418.000000  13418.000000  13418.000000   \n",
       "mean       1.464972      1.323372      0.888732      1.159711      2.479654   \n",
       "std        2.959769      2.720104      1.972209     15.695718      7.375475   \n",
       "min        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "25%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "50%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "75%        2.000000      2.000000      1.000000      1.000000      2.000000   \n",
       "max       82.000000     85.000000     71.000000   1801.000000    306.000000   \n",
       "\n",
       "           ...               c_92          c_93          c_94          c_95  \\\n",
       "count      ...       13418.000000  13418.000000  13418.000000  13418.000000   \n",
       "mean       ...           0.064913      0.083992      0.093755      0.070502   \n",
       "std        ...           0.309890      0.377730      0.388404      0.312148   \n",
       "min        ...           0.000000      0.000000      0.000000      0.000000   \n",
       "25%        ...           0.000000      0.000000      0.000000      0.000000   \n",
       "50%        ...           0.000000      0.000000      0.000000      0.000000   \n",
       "75%        ...           0.000000      0.000000      0.000000      0.000000   \n",
       "max        ...           7.000000      9.000000     10.000000      9.000000   \n",
       "\n",
       "               c_96          c_97          c_98          c_99         c_100  \\\n",
       "count  13418.000000  13418.000000  13418.000000  13418.000000  13418.000000   \n",
       "mean       0.082427      0.233790      0.082874      0.076837      0.073558   \n",
       "std        0.503164     15.553234      0.356777      0.455338      0.337954   \n",
       "min        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "25%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "50%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "75%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "max       23.000000   1801.000000      9.000000     16.000000      7.000000   \n",
       "\n",
       "            c_other  \n",
       "count  13418.000000  \n",
       "mean      57.554777  \n",
       "std      110.916584  \n",
       "min        0.000000  \n",
       "25%       14.000000  \n",
       "50%       38.000000  \n",
       "75%       75.000000  \n",
       "max     9664.000000  \n",
       "\n",
       "[8 rows x 105 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enents_in_tt.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "415464198     265\n",
       "4236892345     25\n",
       "2819832612     17\n",
       "4006139556     15\n",
       "1959314866     12\n",
       "3886178269     12\n",
       "3318624521     12\n",
       "1999363021     12\n",
       "3591486701     11\n",
       "3010499707     10\n",
       "2758197166     10\n",
       "1533531021      9\n",
       "3155750308      9\n",
       "3286716293      9\n",
       "1931400789      9\n",
       "2713964514      9\n",
       "2921397339      9\n",
       "3302780718      9\n",
       "2328001055      8\n",
       "3827598704      8\n",
       "314669641       8\n",
       "1897020031      8\n",
       "3934954183      8\n",
       "2610785821      8\n",
       "2200447857      8\n",
       "1679899178      7\n",
       "2294440380      7\n",
       "3566631920      7\n",
       "4235478695      7\n",
       "3639934255      7\n",
       "             ... \n",
       "3413862856      1\n",
       "1037291975      1\n",
       "2477809491      1\n",
       "965075958       1\n",
       "653775234       1\n",
       "2956653609      1\n",
       "2471115202      1\n",
       "2768755375      1\n",
       "2628340180      1\n",
       "3301020119      1\n",
       "2269647343      1\n",
       "4012009608      1\n",
       "2365107852      1\n",
       "1125925357      1\n",
       "3494697533      1\n",
       "3989132412      1\n",
       "2107293087      1\n",
       "1803637252      1\n",
       "4109205703      1\n",
       "2472792551      1\n",
       "1787042278      1\n",
       "160154085       1\n",
       "1172028568      1\n",
       "3587170037      1\n",
       "2179611106      1\n",
       "483416542       1\n",
       "475070941       1\n",
       "2980032987      1\n",
       "541784538       1\n",
       "2422234394      1\n",
       "Name: user_id, Length: 11453, dtype: int64"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enents_in_tt['user_id'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "enents_in_tt.to_csv('enents_in_train&test.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
